In [2]:
%matplotlib qt4
from __future__ import division
import math
from models import tools, optimize, models, filters
from models.tests import PerformanceTest
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from cycler import cycler
# Global plot styling: seaborn "ticks" theme with a framed legend, LaTeX text
# rendering switched off, and a three-colour line cycle for all axes.
sns.set_style("ticks", rc={"legend.frameon": True})
mpl.rc('text', usetex=False)
mpl.rc('text.latex', unicode=False)
mpl.rc('axes', prop_cycle=cycler('color', ['#02A5F4', 'orange', 'green']))
In [3]:
# Load a window of the data set through the project loader (limit/offset are
# presumably counted in rows — confirm in tools.load_data).
data = tools.load_data(limit=500000, offset=1000000)
# Keep only what filters.sequentize selects; its exact criterion lives in models.filters.
data = data[filters.sequentize(data)]
In [4]:
# Size of the filtered data set (displayed as the cell output).
len(data)
Out[4]:
In [4]:
def produce_logger(SuperClass):
    """Create a subclass of ``SuperClass`` that logs predictions before updating.

    Instances of the returned class carry a ``timing`` list. Whenever an
    answer arrives for a (user_id, place_id) item whose ``practices`` is
    non-empty, the tuple ``(time_diff, is_correct, prediction)`` is stored
    there; the prediction is taken *before* the parent ``update`` runs.
    """

    class Logger(SuperClass):

        def __init__(self, *args, **kwargs):
            super(Logger, self).__init__(*args, **kwargs)
            self.timing = []

        def update(self, answer):
            key = (answer.user_id, answer.place_id)
            item = self.items[key]
            if item.practices:
                # Time between this answer and the previous one on the same item.
                elapsed = tools.time_diff(answer.inserted, item.last_inserted)
                record = (elapsed, answer.is_correct, self.predict(answer))
                self.timing.append(record)
            super(Logger, self).update(answer)

    return Logger
# Logging variants of every model class under comparison. Each one behaves like
# its parent but also collects (time diff, correctness, prediction) tuples in
# `.timing` via produce_logger.
LogPFA = produce_logger(models.PFAModel)
LogPFAExt = produce_logger(models.PFAExt)
LogPFAGong = produce_logger(models.PFAGong)
LogPFAExtTiming = produce_logger(models.PFAExtTiming)
LogPFAExtSpacing = produce_logger(models.PFAExtSpacing)
LogPFAGongTiming = produce_logger(models.PFAGongTiming)
LogPFATiming = produce_logger(models.PFATiming)
LogPFAExtStaircase = produce_logger(models.PFAExtStaircase)
In [6]:
def time_effect_log(t, a=1.8, c=0.123):
    """Logarithmic time effect ``a - c * ln(t)``.

    ``t`` is handed to ``np.log``, so scalars and NumPy arrays both work.
    """
    log_t = np.log(t)
    return a - c * log_t
def time_effect_div(t, a=2, c=0.2):
    """Power-law time effect ``a / (t + 1) ** c``."""
    scale = (t + 1) ** c
    return a / scale
def time_effect_exp(t, a=1.6, c=0.01):
    """Exponential time effect ``a * exp(-c * sqrt(t))``."""
    root = np.sqrt(t)
    return a * np.exp(-c * root)
In [5]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` containing at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    ``range`` is used instead of the Python-2-only ``xrange`` so the helper
    runs unchanged on Python 2 and Python 3.
    """
    for start in range(0, len(l), n):
        yield l[start:start + n]
def interval_error(timings, interval_size=500, metric=tools.rmse):
    """Evaluate ``metric`` on equally sized groups of logged answers, ordered by time gap.

    Args:
        timings: iterable of ``(diff, observed, predicted)`` triples, such as
            the ``timing`` list collected by a Logger model.
        interval_size: number of consecutive triples (after sorting by
            ``diff``) per group; the last group may be smaller.
        metric: callable invoked as ``metric(observed_list, predicted_list)``.

    Returns:
        A list of ``(mean_diff, metric_value)`` pairs, one per group, in
        increasing order of ``diff``.
    """
    ordered = sorted(timings, key=lambda triple: triple[0])

    def summarize(chunk):
        mean_diff = np.mean([diff for diff, _, _ in chunk])
        # Triples with a non-finite prediction are left out of the metric but
        # still contribute to the mean time difference of the group.
        finite = [(obs, pred) for _, obs, pred in chunk if np.isfinite(pred)]
        observed = [obs for obs, _ in finite]
        predicted = [pred for _, pred in finite]
        return mean_diff, metric(observed, predicted)

    return [summarize(chunk) for chunk in chunks(ordered, interval_size)]
In [8]:
# Logging PFA model built around an EloModel instance, with gamma=2 and delta=-1.
# Training on `data` is what populates pfa.timing.
pfa = LogPFA(models.EloModel(), gamma=2, delta=-1)
pfa.train(data)
In [9]:
# Logging PFAExt model with the class's default parameters, trained on `data`.
pfae = LogPFAExt(models.EloModel())
pfae.train(data)
In [165]:
def time_effect_div(t, a=2.5, c=0.17):
    """Power-law time effect ``a / (t + 1) ** c``.

    Redefinition: replaces any earlier ``time_effect_div`` in the notebook
    namespace with these defaults (a=2.5, c=0.17).
    """
    scale = (t + 1) ** c
    return a / scale
# NOTE(review): this cell defines time_effect_div just above, yet the model is
# given time_effect_log — confirm which time effect was intended. Which
# time_effect_log is in scope also depends on cell execution order, since it is
# defined more than once in this notebook.
pfaet = LogPFAExtTiming(models.EloModel(), time_effect_fun=time_effect_log)
pfaet.train(data)
In [7]:
# Logging PFAGong model with explicit decay/gamma/delta values (their origin is
# not shown in this notebook), trained on `data`.
pfag = LogPFAGong(models.EloModel(), decay=0.349, gamma=2.040, delta=-0.11)
pfag.train(data)
In [167]:
def time_effect_div(t, a=1.2, c=0.15):
    """Power-law time effect ``a / (t + 1) ** c``.

    Redefinition: replaces any earlier ``time_effect_div`` in the notebook
    namespace with these defaults (a=1.2, c=0.15).
    """
    scale = (t + 1) ** c
    return a / scale
# Logging PFAGongTiming model using the power-law time effect currently bound
# to time_effect_div, trained on `data`.
pfagt = LogPFAGongTiming(models.EloModel(), time_effect_fun=time_effect_div)
pfagt.train(data)
In [8]:
# Logging PFAExtStaircase model. `staircase` maps (lower, upper) ranges of the
# time since the previous attempt to a constant value; the bounds look like
# seconds (60*60*24 is one day) — confirm the unit and the boundary inclusivity
# in models.PFAExtStaircase.
pfaest = LogPFAExtStaircase(models.EloModel(),
gamma=2.2, delta=-0.9,
staircase={
(0, 60): 1.43,
(60, 90): 1.17,
(90, 150): 1.01,
(150, 300): 0.93,
(300, 600): 0.82,
(600, 60*30): 0.78,
(60*30, 60*60*3): 0.76,
(60*60*3, 60*60*24): 0.63,
(60*60*24, 60*60*24*5): 0.42,
(60*60*24*5, np.inf): 0.12,
}
)
pfaest.train(data)
In [8]:
def time_effect_log(t, a=1.8, c=0.123):
    """Logarithmic time effect ``a - c * ln(t)``.

    Redefinition: rebinds ``time_effect_log`` in the notebook namespace with
    defaults a=1.8, c=0.123.
    """
    log_t = np.log(t)
    return a - c * log_t
# Logging PFAExtTiming model with the logarithmic time effect, trained on `data`.
pfaet1 = LogPFAExtTiming(models.EloModel(), time_effect_fun=time_effect_log)
pfaet1.train(data)
In [9]:
def time_effect_exp(t, a=1.6, c=0.01):
    """Exponential time effect ``a * exp(-c * sqrt(t))``.

    Redefinition: rebinds ``time_effect_exp`` in the notebook namespace with
    defaults a=1.6, c=0.01.
    """
    root = np.sqrt(t)
    return a * np.exp(-c * root)
# Logging PFAExtTiming model with the exponential time effect, trained on `data`.
pfaet2 = LogPFAExtTiming(models.EloModel(), time_effect_fun=time_effect_exp)
pfaet2.train(data)
In [10]:
def time_effect_div(t, a=2.608, c=0.241):
    """Power-law time effect ``a / (t + 1) ** c``.

    Redefinition: replaces any earlier ``time_effect_div`` in the notebook
    namespace with these defaults (a=2.608, c=0.241).
    """
    scale = (t + 1) ** c
    return a / scale
# Logging PFAExtTiming model with explicit gamma/delta and the power-law time
# effect currently bound to time_effect_div, trained on `data`.
pfaet3 = LogPFAExtTiming(models.EloModel(), gamma=2.293, delta=-0.664,
time_effect_fun=time_effect_div)
pfaet3.train(data)
In [11]:
def time_effect_log(t, a=0.8, c=0.05):
    """Logarithmic time effect ``a - c * ln(t)``.

    Redefinition: rebinds ``time_effect_log`` in the notebook namespace with
    defaults a=0.8, c=0.05.
    """
    log_t = np.log(t)
    return a - c * log_t
# Logging PFAGongTiming model with the logarithmic time effect, trained on `data`.
pfagt1 = LogPFAGongTiming(models.EloModel(), time_effect_fun=time_effect_log)
pfagt1.train(data)
In [12]:
def time_effect_exp(t, a=0.5, c=0.002):
    """Exponential time effect ``a * exp(-c * sqrt(t))``.

    Redefinition: rebinds ``time_effect_exp`` in the notebook namespace with
    defaults a=0.5, c=0.002.
    """
    root = np.sqrt(t)
    return a * np.exp(-c * root)
# Logging PFAGongTiming model with the exponential time effect, trained on `data`.
pfagt2 = LogPFAGongTiming(models.EloModel(), time_effect_fun=time_effect_exp)
pfagt2.train(data)
In [7]:
def time_effect_div(t, a=1.2, c=0.15):
    """Power-law time effect ``a / (t + 1) ** c``.

    Redefinition: replaces any earlier ``time_effect_div`` in the notebook
    namespace with these defaults (a=1.2, c=0.15).
    """
    scale = (t + 1) ** c
    return a / scale
# Logging PFAGongTiming model with the power-law time effect currently bound to
# time_effect_div, trained on `data`. Used as m1 in the comparison plot.
pfagt3 = LogPFAGongTiming(models.EloModel(), time_effect_fun=time_effect_div)
pfagt3.train(data)
In [6]:
# Logging PFATiming model with 'pow' time effects for both time_effect_good and
# time_effect_bad. The a/b/c/d values are passed through as keyword arguments;
# they look like fitted parameters, but their origin is not shown here.
# Used as m2 in the comparison plot.
pfagt4 = LogPFATiming(models.EloModel(),
time_effect_good='pow', time_effect_bad='pow',
**{'a': 3.1384736895278618, 'c': 0.19758248174437759,
'b': 5.0679683848943906, 'd': 0.76393889411131488})
pfagt4.train(data)
In [10]:
# Compare three trained logging models: mean (observed - predicted) per group
# of 12000 answers, plotted against the mean time since the previous attempt.
m1 = pfagt3
m2 = pfagt4
m3 = pfaest
n1 = m1.ABBR + ' pow'
n2 = m2.ABBR + ' pow'
n3 = m3.ABBR + ''

# Signed calibration error of a group: mean correctness minus mean prediction.
metric = lambda y_true, y_pred: (np.mean(y_true) - np.mean(y_pred))

intervals1 = interval_error(m1.timing, interval_size=12000, metric=metric)
intervals2 = interval_error(m2.timing, interval_size=12000, metric=metric)
intervals3 = interval_error(m3.timing, interval_size=12000, metric=metric)

plt.figure(num=None, figsize=(5, 4), dpi=120)
plt.axhline(0, color='#888888', linestyle='--')

times1 = [point[0] for point in intervals1]
p1 = plt.plot(times1, [point[1] for point in intervals1], 'o-')
p2 = plt.plot([point[0] for point in intervals2], [point[1] for point in intervals2], 'o-')
p3 = plt.plot([point[0] for point in intervals3], [point[1] for point in intervals3], 'o-')

plt.xscale('log')
plt.ylabel('Correctness - Prediction')
plt.xlabel('Time from previous attempt in seconds')
# The x-range follows the first model's groups.
plt.xlim([min(times1), max(times1)])
plt.ylim([-0.1, 0.1])

legend = plt.legend([p1[0], p2[0], p3[0]], (n1, n2, n3), loc='upper right', prop={'size': 12})
legend.get_frame().set_linewidth(1)

# Lay out the figure before showing it: with a blocking backend, calling
# tight_layout() after show() would act on a fresh, empty figure.
plt.tight_layout()
plt.show()
In [26]:
# Visual parameter sweep for the logarithmic time effect a - b*ln(t): train one
# PFAGongTiming logger per (a, b) pair and plot its per-group calibration error.
plots = []
params = [(x, y) for x in [1.1, 1.2, 1.3] for y in [0.08, 0.09]]
intervals = []

# Observed minus predicted, so the curve matches the y-axis label below and the
# sign convention of the model-comparison plot. The original computed
# predicted - observed under the same label.
metric = lambda y_true, y_pred: (np.mean(y_true) - np.mean(y_pred))
time_effect_template = lambda a, b: (lambda t: a - b*np.log(t))

for a, b in params:
    time_effect = time_effect_template(a, b)
    # NOTE(review): this rebinds the notebook-level `pfagt` on every iteration.
    pfagt = LogPFAGongTiming(models.EloModel(), time_effect_fun=time_effect)
    pfagt.train(data)
    intervals.append(interval_error(pfagt.timing, interval_size=1000, metric=metric))
    # Single-argument print call: same output on Python 2 and Python 3.
    print('{} done'.format(len(intervals)))

for interval in intervals:
    plots.append(plt.plot([x[0] for x in interval], [x[1] for x in interval], '.-'))

plt.xscale('log')
plt.ylabel('observed - predicted')
plt.xlabel('time from previous attempt (seconds)')
plt.xlim([min([x[0] for x in intervals[0]]) - 20, max([x[0] for x in intervals[0]]) + 100000])
# A real list of labels: map() would hand legend() a lazy iterator on Python 3.
plt.legend([p[0] for p in plots], ['a={},b={}'.format(*pair) for pair in params])
plt.show()
In [10]:
# Presumably adds the 'spacing' column that the interval cells below read via
# data['spacing'] — confirm in tools.add_spacing.
# NOTE(review): rebinding `data` makes this cell non-idempotent if add_spacing
# is not safe to apply twice.
data = tools.add_spacing(data)
In [12]:
# For each spacing interval (lower, upper], run a PerformanceTest of a
# PFAExtTiming model on the matching slice and keep its train-set `off` value.
ranges = [0, 60, 90, 150, 300, 600, 1800, 10800, 86400, 259200, 2592000]
intervals = {i: None for i in zip(ranges, ranges[1:] + [np.inf])}
for interval in list(intervals):
    lower, upper = interval
    # The upper bound is inclusive, matching the other interval cells in this
    # notebook; with a strict '<', rows exactly on a boundary landed in no slice.
    in_range = (data['spacing'] > lower) & (data['spacing'] <= upper)
    data_slice = data[in_range].copy()
    if data_slice.empty:
        # Intervals without data keep their None placeholder.
        continue
    # Single-argument print call: same output on Python 2 and Python 3.
    print('{} {}'.format(interval, len(data_slice)))
    pfaet = models.PFAExtTiming(models.EloModel(), time_effect_fun=lambda t: t/80)
    pfaet_test = PerformanceTest(pfaet, data_slice)
    pfaet_test.run()
    intervals[interval] = pfaet_test.results['train'].off
# Replace the dict by (interval midpoint, value) pairs sorted by midpoint.
intervals = sorted([(np.mean(interval), value) for interval, value in intervals.items()], key=lambda x: x[0])
In [19]:
# Share of correct answers per spacing interval (lower, upper].
ranges = [0, 60, 90, 150, 300, 600, 1800, 10800, 86400, 259200, 2592000]
intervals = dict.fromkeys(zip(ranges, ranges[1:] + [np.inf]))
for lower, upper in list(intervals):
    spacing = data['spacing']
    data_slice = data[(spacing > lower) & (spacing <= upper)]
    if len(data_slice) == 0:
        # Empty intervals keep their None placeholder.
        continue
    n_correct = len(data_slice[data_slice['is_correct'] == 1])
    # True division: the notebook imports `division` from __future__.
    intervals[(lower, upper)] = n_correct / len(data_slice)
# Drop the open-ended last interval; its value is shown as the cell output.
intervals.pop((2592000, np.inf))
In [20]:
# Plot the per-interval value against the interval midpoint on a log axis.
# The sorted points get their own name: `intervals` has to stay a dict because
# the bar-chart cell that follows indexes it by (lower, upper), and keeping it
# also makes this cell safe to re-run.
interval_points = sorted(
    [(np.mean(interval), value) for interval, value in intervals.items()],
    key=lambda point: point[0])
plt.plot([point[0] for point in interval_points], [point[1] for point in interval_points])
plt.xscale('log')
plt.show()
In [18]:
# Stacked bar chart of correct vs. incorrect share per spacing interval.
# Expects `intervals` to be the dict keyed by (lower, upper) tuples.
ordered_keys = sorted(intervals)
ind = np.arange(len(intervals))  # x positions of the bars
width = 0.50  # bar width

correctness = [intervals[key] * 100 for key in ordered_keys]
incorrectness = [(1 - intervals[key]) * 100 for key in ordered_keys]

p1 = plt.bar(ind, correctness, width, color='#7FFF24')
# The incorrect share is stacked on top of the correct share.
p2 = plt.bar(ind, incorrectness, width, bottom=correctness, color='#ff512e')

plt.ylabel('%')
tick_labels = ('60 s', '90 s', '150 s', '5 m', '10 m',
               '30 m', '3 h', '24 h', '3 d', '30 d')
plt.xticks(ind+width/2., tick_labels)
plt.yticks(np.arange(0, 101, 10))
plt.legend((p1[0], p2[0]), ('correct', 'incorrect'), loc=4)
plt.show()
In [6]:
# Group all answers by (user_id, place_id), preserving row order within a group.
items = {}
for _, row in data.iterrows():
    answer = models.Answer(**row.to_dict())
    items.setdefault((row.user_id, row.place_id), []).append(answer)
In [7]:
# Boundaries of the spacing buckets; consecutive values form (lower, upper]
# intervals, with an open-ended last one.
ranges = [0, 60, 90, 150, 300, 600, 1800, 10800, 86400, 259200, 2592000]
# Materialised as a list: `intervals` is iterated several times and passed to
# len() further down, which a bare zip() iterator on Python 3 does not support.
intervals = list(zip(ranges, ranges[1:] + [np.inf]))
def get_interval(value, list_of_intervals):
    """Return the first ``(lower, upper)`` pair with ``lower < value <= upper``.

    Intervals are checked in the order given; ``None`` is returned when no
    interval contains ``value``.
    """
    matches = ((lower, upper) for lower, upper in list_of_intervals
               if lower < value <= upper)
    return next(matches, None)
# For each pair of consecutive answers on the same item, file the correctness
# of the later answer under the time-gap interval, split by whether the
# earlier answer was correct.
correct_before = dict((bounds, []) for bounds in intervals)
incorrect_before = dict((bounds, []) for bounds in intervals)
for history in items.values():
    answers = sorted(history, key=lambda x: x.inserted)
    for a1, a2 in zip(answers, answers[1:]):
        gap = tools.time_diff(a2.inserted, a1.inserted)
        interval = get_interval(gap, intervals)
        if interval is None:
            # The gap falls outside every interval (e.g. a gap of 0).
            continue
        bucket = correct_before if a1.is_correct else incorrect_before
        bucket[interval].append(a2.is_correct)
In [9]:
# Mean correctness of the follow-up answer per interval, split by the outcome
# of the preceding answer. np.mean of an empty list gives nan, so intervals
# that received no answers end up as nan.
correct_intervals = {i: np.mean(v) for i, v in correct_before.items()}
incorrect_intervals = {i: np.mean(v) for i, v in incorrect_before.items()}
In [17]:
# Grouped bar chart: follow-up success rate per spacing interval, split by
# whether the previous answer was correct. The open-ended last interval is
# left out.
last_bucket = (2592000, np.inf)
shown = [key for key in sorted(intervals) if key != last_bucket]

ind = (np.arange(len(intervals)-1) -1.2) * 1.15  # x positions of the groups
width = 0.4  # bar width

correctness = [correct_intervals[key] * 100 for key in shown]
incorrectness = [incorrect_intervals[key] * 100 for key in shown]

p1 = plt.bar(ind-0.25, correctness, width, color='#7FFF24')
p2 = plt.bar(ind+0.25, incorrectness, width, color='#ff512e')

plt.ylabel('%')
tick_labels = ('60 s', '90 s', '150 s', '5 m', '10 m',
               '30 m', '3 h', '24 h', '3 d', '30 d')
plt.xticks(ind+width/2., tick_labels)
plt.yticks(np.arange(0, 101, 10))
plt.legend((p2[0], p1[0]), ('incorrect before', 'correct before'), loc=4)
plt.show()
In [ ]: